1. Data

Load the student scores for the test - here we load the ETH Zurich test data, downloaded from https://pontifex.ethz.ch/s21t5/rate/

The scores are:

  • 0 if the student gave an incorrect response
  • 1 if the student gave a correct response
  • 2 if the student chose the “I don’t know” answer option
  • NA if there was no response recorded
test_scores
## # A tibble: 9,671 x 44
##    test_version year  class      A1_B1 A2_B0 A3_B2 A4_B3 A5_B4 A6_B5 A7_B6 A8_B7
##    <chr>        <chr> <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 pre          2017  s21t-000-~     1     0     1     1     1     0     1     0
##  2 pre          2017  s21t-000-~     1     0     1     1     1     1     0     1
##  3 pre          2017  s21t-000-~     1     0     0     0     1     1     1     0
##  4 pre          2017  s21t-000-~     1     0     1     1     1     1     1     1
##  5 pre          2017  s21t-000-~     1     0     1     0     2     0     1     0
##  6 pre          2017  s21t-000-~     0     1     0     0     1     2     0     2
##  7 pre          2017  s21t-000-~     1     0     1     0     2     1     0     1
##  8 pre          2017  s21t-000-~     1     1     1     1     2     1     1     2
##  9 pre          2017  s21t-000-~     1     1     0     1     1     1     1     1
## 10 pre          2017  s21t-000-~     1     0     1     0     0     1     0     0
## # ... with 9,661 more rows, and 33 more variables: A9_B8 <dbl>, A10_B9 <dbl>,
## #   A11_B10 <dbl>, A12_B0 <dbl>, A13_B0 <dbl>, A14_B12 <dbl>, A15_B13 <dbl>,
## #   A16_B14 <dbl>, A17_B15 <dbl>, A18_B16 <dbl>, A19_B0 <dbl>, A20_B17 <dbl>,
## #   A21_B18 <dbl>, A22_B19 <dbl>, A23_B20 <dbl>, A24_B21 <dbl>, A25_B0 <dbl>,
## #   A26_B22 <dbl>, A27_B23 <dbl>, A28_B24 <dbl>, A29_B25 <dbl>, A30_B0 <dbl>,
## #   A31_B27 <dbl>, A32_B28 <dbl>, A33_B29 <dbl>, A34_B0 <dbl>, A35_B0 <dbl>,
## #   A36_B0 <dbl>, A0_B11 <dbl>, A0_B26 <dbl>, A0_B30 <dbl>, A0_B31 <dbl>, ...

For this analysis, we replace the “2 = I don’t know” scores with NA, so that they will be treated as missing data.

# Recode the "I don't know" responses (2) as missing values
test_scores <- test_scores %>%
  mutate(across(starts_with("A"), ~ na_if(.x, 2))) %>%
  # A few students (23 out of 9671) gave "I don't know" as their answer to every question.
  # Drop those rows, since all-NA rows cause problems for mirt: keep a row only
  # if at least one item column is non-missing.
  filter(if_any(starts_with("A"), ~ !is.na(.x)))

TODO - consider mapping this to 0 instead, as it reflects being “not correct”

TODO - also try fitting a 3PL model to see if the “don’t know” responses reduce the guessing parameter

Data summary

The number of responses from each cohort:

# Number of responses per cohort, with the counts column shaded
test_scores %>%
  count(year) %>%
  gt() %>%
  data_color(
    columns = c("n"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  )
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
year n
2017 1678
2018 1746
2019 1995
2020 2188
2021 2041

Mean and standard deviation for each item:

# Per-item summary (completeness, n missing, mean, SD) broken down by year,
# rendered as one wide gt table with a column group per year
test_scores %>% 
  # keep only the year column plus the item score columns
  select(-class, -test_version) %>% 
  group_by(year) %>% 
  # skimr summary without the inline histogram sparklines
  skim_without_charts() %>% 
  # drop character-column stats, the numeric percentiles, and the type column
  select(-contains("character."), -contains("numeric.p"), -skim_type) %>% 
  rename(complete = complete_rate) %>% 
  # make the table wider, i.e. with separate columns for each year's results, with the year at the start of each column name
  pivot_wider(names_from = year, values_from = -c(skim_variable, year), names_glue = "{year}__{.value}") %>% 
  # put the columns in order by year
  select(sort(names(.))) %>% 
  select(skim_variable, everything()) %>% 
  # use GT to make the table look nice
  gt(rowname_col = "skim_variable") %>% 
  # group the columns from each year (splits names on the "__" separator)
  tab_spanner_delim(delim = "__") %>%
  fmt_number(columns = contains("numeric"), decimals = 2) %>%
  fmt_percent(columns = contains("complete"), decimals = 0) %>% 
  # change all the numeric.mean and numeric.sd column names to Mean and SD
  # (build a named vector of column name -> label, one entry per year)
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.mean"), label = "Mean") %>% deframe()
  ) %>% 
  cols_label(
    .list = test_scores %>% select(year) %>% distinct() %>% transmute(col = paste0(year, "__numeric.sd"), label = "SD") %>% deframe()
  ) %>%
  # shade the mean columns so easier/harder items stand out
  data_color(
    columns = contains("numeric.mean"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
2017 2018 2019 2020 2021
complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD complete n_missing Mean SD
A1_B1 99% 15 0.96 0.20 99% 16 0.96 0.19 99% 12 0.96 0.19 99% 13 0.96 0.19 99% 16 0.96 0.19
A2_B0 92% 131 0.33 0.47 94% 97 0.30 0.46 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A3_B2 95% 77 0.65 0.48 98% 37 0.66 0.47 99% 29 0.68 0.47 98% 48 0.67 0.47 98% 35 0.68 0.47
A4_B3 98% 27 0.65 0.48 99% 23 0.63 0.48 98% 35 0.65 0.48 99% 31 0.63 0.48 98% 42 0.62 0.49
A5_B4 68% 544 0.69 0.46 75% 441 0.66 0.47 74% 513 0.66 0.47 73% 592 0.67 0.47 75% 513 0.66 0.47
A6_B5 78% 362 0.90 0.30 85% 269 0.87 0.34 85% 296 0.88 0.33 86% 299 0.89 0.32 85% 297 0.87 0.33
A7_B6 99% 19 0.72 0.45 99% 13 0.71 0.45 99% 25 0.70 0.46 99% 20 0.71 0.45 99% 16 0.69 0.46
A8_B7 81% 311 0.58 0.49 89% 199 0.56 0.50 90% 195 0.58 0.49 87% 283 0.69 0.46 87% 261 0.67 0.47
A9_B8 63% 613 0.73 0.44 69% 549 0.67 0.47 69% 628 0.67 0.47 70% 661 0.62 0.48 70% 606 0.61 0.49
A10_B9 73% 452 0.74 0.44 79% 367 0.69 0.46 78% 443 0.71 0.45 78% 481 0.71 0.46 77% 474 0.69 0.46
A11_B10 79% 352 0.73 0.44 83% 293 0.70 0.46 85% 299 0.68 0.46 84% 354 0.71 0.45 82% 360 0.69 0.46
A12_B0 87% 216 0.78 0.41 93% 127 0.77 0.42 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A13_B0 87% 211 0.43 0.50 89% 188 0.40 0.49 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A14_B12 86% 237 0.65 0.48 89% 192 0.65 0.48 89% 229 0.66 0.47 88% 253 0.61 0.49 87% 260 0.62 0.49
A15_B13 84% 269 0.55 0.50 87% 220 0.53 0.50 86% 278 0.55 0.50 87% 279 0.52 0.50 87% 267 0.52 0.50
A16_B14 75% 427 0.25 0.43 80% 341 0.25 0.43 81% 372 0.25 0.43 81% 410 0.24 0.43 80% 404 0.25 0.43
A17_B15 91% 154 0.65 0.48 93% 120 0.65 0.48 93% 133 0.64 0.48 94% 139 0.63 0.48 94% 122 0.61 0.49
A18_B16 72% 468 0.53 0.50 79% 372 0.46 0.50 76% 487 0.37 0.48 78% 492 0.35 0.48 78% 449 0.36 0.48
A19_B0 66% 566 0.53 0.50 73% 468 0.47 0.50 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A20_B17 94% 108 0.78 0.42 95% 85 0.76 0.43 95% 101 0.78 0.42 95% 100 0.74 0.44 95% 105 0.75 0.43
A21_B18 76% 408 0.67 0.47 81% 336 0.65 0.48 82% 368 0.66 0.47 82% 401 0.63 0.48 80% 405 0.64 0.48
A22_B19 72% 477 0.72 0.45 76% 420 0.68 0.47 77% 453 0.67 0.47 78% 481 0.66 0.47 77% 479 0.65 0.48
A23_B20 71% 492 0.58 0.49 77% 397 0.56 0.50 78% 439 0.54 0.50 77% 512 0.55 0.50 77% 461 0.55 0.50
A24_B21 81% 326 0.73 0.44 85% 260 0.68 0.47 87% 268 0.69 0.46 86% 313 0.66 0.47 84% 325 0.68 0.47
A25_B0 95% 78 0.57 0.50 97% 46 0.75 0.43 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A26_B22 85% 250 0.94 0.24 87% 230 0.93 0.26 87% 261 0.93 0.26 87% 276 0.93 0.26 87% 259 0.92 0.27
A27_B23 65% 590 0.60 0.49 70% 531 0.58 0.49 71% 580 0.60 0.49 71% 635 0.56 0.50 70% 609 0.59 0.49
A28_B24 61% 647 0.69 0.46 70% 529 0.65 0.48 70% 600 0.65 0.48 68% 710 0.62 0.49 68% 646 0.60 0.49
A29_B25 78% 373 0.62 0.48 82% 319 0.63 0.48 83% 344 0.61 0.49 81% 409 0.60 0.49 81% 387 0.62 0.49
A30_B0 91% 145 0.83 0.38 93% 121 0.79 0.41 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A31_B27 87% 214 0.43 0.50 90% 166 0.41 0.49 90% 192 0.44 0.50 88% 268 0.46 0.50 86% 280 0.47 0.50
A32_B28 72% 466 0.32 0.46 75% 436 0.27 0.45 78% 446 0.26 0.44 76% 529 0.27 0.45 76% 485 0.27 0.44
A33_B29 97% 57 0.81 0.39 96% 63 0.79 0.41 97% 69 0.80 0.40 97% 76 0.76 0.43 96% 79 0.78 0.41
A34_B0 70% 497 0.83 0.38 76% 425 0.81 0.39 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A35_B0 47% 886 0.69 0.46 54% 797 0.61 0.49 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A36_B0 51% 830 0.44 0.50 58% 741 0.40 0.49 0% 1995 NaN NA 0% 2188 NaN NA 0% 2041 NaN NA
A0_B11 0% 1678 NaN NA 0% 1746 NaN NA 94% 124 0.61 0.49 94% 123 0.63 0.48 95% 106 0.61 0.49
A0_B26 0% 1678 NaN NA 0% 1746 NaN NA 88% 240 0.61 0.49 87% 288 0.62 0.49 86% 290 0.61 0.49
A0_B30 0% 1678 NaN NA 0% 1746 NaN NA 79% 420 0.77 0.42 79% 464 0.77 0.42 78% 451 0.77 0.42
A0_B31 0% 1678 NaN NA 0% 1746 NaN NA 69% 618 0.50 0.50 69% 676 0.47 0.50 75% 519 0.60 0.49
A0_B32 0% 1678 NaN NA 0% 1746 NaN NA 41% 1179 0.49 0.50 42% 1266 0.51 0.50 43% 1154 0.49 0.50

2. Testing assumptions

Before applying IRT, we should check that the data satisfies the assumptions needed by the model. In particular, to use a 1-dimensional IRT model, we should have some evidence of unidimensionality in the test scores.

Local independence

This plot shows the correlations between scores on each pair of items – note that it is restricted to only those items that appear on both versions of the test, since the plotting package did not deal well with missing data:

TODO - check with Mine that this is a valid way to proceed (throwing out questions with missing data)

# Drop the identifier columns, keeping only the item score columns
item_scores <- test_scores %>% 
  select(!c(class, year, test_version))

# Restrict to items present on both test versions: the *_B0 columns are items
# dropped after version A, and the A0_* columns are items added in version B
item_scores_unchanged_only <- item_scores %>% 
  select(!(contains("B0") | contains("A0")))

# Pairwise correlations with confidence intervals (psych)
cor_ci <- psych::corCi(item_scores_unchanged_only, plot = FALSE)

# Plot the correlation matrix with its upper/lower confidence bounds
psych::cor.plot.upperLowerCi(cor_ci)

TODO - add some notes here to explain how to interpret all of this!

There are a few correlations that are not significantly different from 0:

# Item pairs whose correlation is not significantly different from 0
cor_ci$ci %>% 
  as_tibble(rownames = "corr") %>% 
  filter(p > 0.05) %>% 
  arrange(desc(p)) %>% 
  select(!contains(".e")) %>% 
  gt() %>% 
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
A1_B1-A29_B −0.017 0.031 0.593
A29_B-A31_B −0.035 0.009 0.251
A24_B-A29_B −0.010 0.038 0.244

Here we redo the correlation calculations with all the items, and check that there are still only a few cases where the correlations are close to 0:

# Recompute the correlations using every item, not just the unchanged subset
cor_ci2 <- psych::corCi(item_scores, plot = FALSE)

# Item pairs whose correlation is not significantly different from 0
cor_ci2$ci %>%
  as_tibble(rownames = "corr") %>%
  filter(p > 0.05) %>%
  arrange(desc(p)) %>%
  select(-contains(".e")) %>%
  gt() %>%
  fmt_number(columns = 2:4, decimals = 3)
corr lower upper p
A1_B1-A29_B −0.017 0.027 0.656
A29_B-A0_B30 −0.021 0.042 0.501
A2_B0-A34_B −0.018 0.065 0.265
A29_B-A31_B −0.035 0.009 0.246
A1_B1-A2_B0 −0.012 0.056 0.203
A2_B0-A25_B −0.008 0.065 0.131
A1_B1-A12_B −0.008 0.077 0.109
A24_B-A29_B −0.003 0.039 0.096
A12_B-A34_B −0.004 0.085 0.074
A29_B-A0_B2 −0.002 0.048 0.066

The overall picture is that the item scores are well correlated with each other.

Dimensionality

Here we again focus on the subset of items that appeared in both tests.

# Check suitability for factor analysis (KMO sampling adequacy and
# Bartlett's test of sphericity) — results are printed below
structure <- check_factorstructure(item_scores_unchanged_only)
# Estimate the number of factors using a battery of methods; the
# method-agreement summary is printed below
n <- n_factors(item_scores_unchanged_only)

Is the data suitable for Factor Analysis?

  • KMO: The Kaiser, Meyer, Olkin (KMO) measure of sampling adequacy suggests that data seems appropriate for factor analysis (KMO = 0.95).
  • Sphericity: Bartlett’s test of sphericity suggests that there is sufficient significant correlation in the data for factor analysis (Chisq(351) = 39555.38, p < .001).

Method Agreement Procedure:

The choice of 1 dimensions is supported by 6 (26.09%) methods out of 23 (Acceleration factor, R2, VSS complexity 1, Velicer’s MAP, TLI, RMSEA).

# Visualise how many methods support each candidate number of factors
plot(n)

# Tabulate the method-agreement results
summary(n) %>% gt()
n_Factors n_Methods
1 6
2 2
3 1
4 6
7 1
10 2
17 1
20 1
26 3
#n %>% tibble() %>% gt()
# Parallel analysis (psych) as an additional check on the number of factors
fa.parallel(item_scores_unchanged_only, fa = "fa")

## Parallel analysis suggests that the number of factors =  8  and the number of components =  NA

1 Factor

# Replace missing scores with 0 before factanal(), which cannot handle NAs.
# NOTE(review): treating "missing" as "incorrect" (0) is an assumption —
# confirm this is acceptable for the factor analysis, or consider an
# approach based on pairwise-complete correlations instead.
item_scores_unchanged_only_and_no_na <- item_scores_unchanged_only %>%
  mutate(
    across(everything(), ~replace_na(.x, 0))
  )
# Maximum-likelihood factor analysis with a single factor
fitfact <- factanal(item_scores_unchanged_only_and_no_na, factors = 1, rotation = "varimax")
# sort = TRUE orders items by loading; loadings below the cutoff are hidden
print(fitfact, digits = 2, cutoff = 0.3, sort = TRUE)
## 
## Call:
## factanal(x = item_scores_unchanged_only_and_no_na, factors = 1,     rotation = "varimax")
## 
## Uniquenesses:
##   A1_B1   A3_B2   A4_B3   A5_B4   A6_B5   A7_B6   A8_B7   A9_B8  A10_B9 A11_B10 
##    0.96    0.77    0.80    0.72    0.77    0.78    0.78    0.62    0.65    0.69 
## A14_B12 A15_B13 A16_B14 A17_B15 A18_B16 A20_B17 A21_B18 A22_B19 A23_B20 A24_B21 
##    0.72    0.88    0.82    0.79    0.84    0.64    0.59    0.61    0.62    0.74 
## A26_B22 A27_B23 A28_B24 A29_B25 A31_B27 A32_B28 A33_B29 
##    0.70    0.64    0.64    0.87    0.77    0.80    0.88 
## 
## Loadings:
##  [1] 0.52 0.62 0.59 0.56 0.53 0.60 0.64 0.63 0.61 0.51 0.55 0.60 0.60      0.48
## [16] 0.45 0.48 0.47 0.46 0.34 0.42 0.46 0.40 0.36 0.47 0.45 0.35
## 
##                Factor1
## SS loadings       6.90
## Proportion Var    0.26
## 
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 4407.69 on 324 degrees of freedom.
## The p-value is 0
# Tidy the factor analysis into a tibble with the item name (`variable`)
# and its loading on factor 1 (`fl1`)
load <- tidy(fitfact)

# Label each point with the actual item name from tidy()'s `variable` column.
# (The previous paste0("A", rownames(load)) built labels from row numbers,
# "A1".."A27", which misalign with the real item numbers because the item
# numbering has gaps — e.g. A2, A12, A13 are not in this subset.)
ggplot(load, aes(x = fl1, y = 0)) + 
  geom_point() + 
  geom_label_repel(aes(label = variable), show.legend = FALSE) +
  labs(x = "Factor 1", y = NULL,
       title = "Standardised Loadings", 
       subtitle = "Based upon correlation matrix") +
  theme_minimal()
## Warning: ggrepel: 11 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

# Table of factor loadings joined with each question's description,
# sorted from highest to lowest loading, with the loadings shaded
load %>%
  select(question = variable, factor_loading = fl1) %>%
  left_join(test_versions %>% select(question = label, description), by = "question") %>%
  arrange(desc(factor_loading)) %>%
  gt() %>%
  data_color(
    columns = contains("factor"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  )
question factor_loading description
A21_B18 0.6435130 (ln(sin))'
A22_B19 0.6254470 hyp.functions
A9_B8 0.6168320 trig.ineq.
A23_B20 0.6140785 slope tangent
A27_B23 0.6002246 int(exp)
A20_B17 0.5969677 (exp)'
A28_B24 0.5963202 Int = 0
A10_B9 0.5879543 trig.identity
A11_B10 0.5569525 period
A26_B22 0.5471497 int(poly)
A14_B12 0.5323281 limit
A5_B4 0.5249438 logs
A24_B21 0.5063266 IVT
A6_B5 0.4835041 logs
A3_B2 0.4808511 arithmetic rules
A31_B27 0.4744211 int(abs)
A7_B6 0.4734288 graph translation
A8_B7 0.4638774 sine pi/3
A17_B15 0.4602994 graph f'
A32_B28 0.4521843 FtoC algebra
A4_B3 0.4460368 Easy ineq.
A16_B14 0.4232210 diff.quotient
A18_B16 0.3956478 product rule
A29_B25 0.3565591 int even funct
A33_B29 0.3468449 difference vectors
A15_B13 0.3444530 series
A1_B1 0.2081748 linear function
# To stop the analysis at this point (skipping the IRT fitting below), uncomment the following line before knitting
#knitr::knit_exit()

3. Fitting 2 parameter logistic MIRT model

We can fit a Multidimensional Item Response Theory (mirt) model. From the function definition:

mirt fits a maximum likelihood (or maximum a posteriori) factor analysis model to any mixture of dichotomous and polytomous data under the item response theory paradigm using either Cai's (2010) Metropolis-Hastings Robbins-Monro (MHRM) algorithm.

The process is to first fit the model, and save the result as a model object that we can then parse to get tabular or visual displays of the model that we might want. When fitting the model, we have the option to specify a few arguments, which then get interpreted as parameters to be passed to the model.

# Fit a unidimensional 2PL IRT model to all item scores. Note that mirt is
# given the data with its missing responses intact (each student only saw
# one version of the test), rather than the NA-imputed copy used above.
fit_2pl <- mirt(
  data = item_scores, # just the columns with question scores
  #removeEmptyRows = TRUE, 
  model = 1,          # number of factors to extract
  itemtype = "2PL",   # 2 parameter logistic model
  SE = TRUE           # estimate standard errors
  )
## 
Iteration: 1, Log-Lik: -153303.326, Max-Change: 3.43983
Iteration: 2, Log-Lik: -144537.585, Max-Change: 2.83797
Iteration: 3, Log-Lik: -143119.698, Max-Change: 0.56850
Iteration: 4, Log-Lik: -142603.679, Max-Change: 0.32841
Iteration: 5, Log-Lik: -142319.878, Max-Change: 0.19091
Iteration: 6, Log-Lik: -142121.621, Max-Change: 0.24745
Iteration: 7, Log-Lik: -141985.177, Max-Change: 0.14796
Iteration: 8, Log-Lik: -141875.725, Max-Change: 0.18103
Iteration: 9, Log-Lik: -141796.485, Max-Change: 0.10819
Iteration: 10, Log-Lik: -141728.929, Max-Change: 0.14340
Iteration: 11, Log-Lik: -141681.982, Max-Change: 0.08343
Iteration: 12, Log-Lik: -141639.306, Max-Change: 0.10933
Iteration: 13, Log-Lik: -141610.936, Max-Change: 0.04694
Iteration: 14, Log-Lik: -141583.493, Max-Change: 0.07738
Iteration: 15, Log-Lik: -141566.377, Max-Change: 0.04283
Iteration: 16, Log-Lik: -141550.332, Max-Change: 0.06029
Iteration: 17, Log-Lik: -141539.561, Max-Change: 0.03012
Iteration: 18, Log-Lik: -141529.112, Max-Change: 0.04121
Iteration: 19, Log-Lik: -141522.131, Max-Change: 0.02231
Iteration: 20, Log-Lik: -141515.911, Max-Change: 0.03346
Iteration: 21, Log-Lik: -141511.487, Max-Change: 0.01938
Iteration: 22, Log-Lik: -141507.693, Max-Change: 0.02355
Iteration: 23, Log-Lik: -141504.506, Max-Change: 0.01518
Iteration: 24, Log-Lik: -141501.657, Max-Change: 0.01769
Iteration: 25, Log-Lik: -141499.666, Max-Change: 0.01220
Iteration: 26, Log-Lik: -141497.920, Max-Change: 0.00819
Iteration: 27, Log-Lik: -141496.452, Max-Change: 0.00703
Iteration: 28, Log-Lik: -141493.651, Max-Change: 0.00556
Iteration: 29, Log-Lik: -141493.279, Max-Change: 0.00409
Iteration: 30, Log-Lik: -141493.000, Max-Change: 0.00348
Iteration: 31, Log-Lik: -141492.226, Max-Change: 0.00217
Iteration: 32, Log-Lik: -141492.197, Max-Change: 0.00094
Iteration: 33, Log-Lik: -141492.187, Max-Change: 0.00075
Iteration: 34, Log-Lik: -141492.166, Max-Change: 0.00060
Iteration: 35, Log-Lik: -141492.160, Max-Change: 0.00046
Iteration: 36, Log-Lik: -141492.156, Max-Change: 0.00029
Iteration: 37, Log-Lik: -141492.155, Max-Change: 0.00022
Iteration: 38, Log-Lik: -141492.153, Max-Change: 0.00016
Iteration: 39, Log-Lik: -141492.152, Max-Change: 0.00014
Iteration: 40, Log-Lik: -141492.147, Max-Change: 0.00057
Iteration: 41, Log-Lik: -141492.145, Max-Change: 0.00014
Iteration: 42, Log-Lik: -141492.145, Max-Change: 0.00045
Iteration: 43, Log-Lik: -141492.143, Max-Change: 0.00039
Iteration: 44, Log-Lik: -141492.143, Max-Change: 0.00021
Iteration: 45, Log-Lik: -141492.143, Max-Change: 0.00010
Iteration: 46, Log-Lik: -141492.142, Max-Change: 0.00026
Iteration: 47, Log-Lik: -141492.142, Max-Change: 0.00022
Iteration: 48, Log-Lik: -141492.142, Max-Change: 0.00010
Iteration: 49, Log-Lik: -141492.142, Max-Change: 0.00009
## 
## Calculating information matrix...

Local independence

We compute Yen’s \(Q_3\) (1984) to check for any dependence between items after controlling for \(\theta\). This gives a score for each pair of items, with scores above 0.2 regarded as problematic (see DeMars, p. 48).

# Plot the Q3 statistics for every item pair (upper triangle only).
# NOTE(review): residuals_2pl is not created anywhere in the visible
# document — presumably residuals(fit_2pl, type = "Q3") in an unshown
# chunk; confirm and make that chunk visible.
residuals_2pl  %>% as.matrix() %>% 
  corrplot::corrplot(type = "upper")

This shows that most item pairs are independent, with only one pair showing cause for concern:

# Reshape the Q3 matrix into one row per item pair and keep the flagged ones
residuals_2pl %>%
  rownames_to_column(var = "q1") %>%
  as_tibble() %>%
  pivot_longer(cols = starts_with("A"), names_to = "q2", values_to = "Q3_score") %>%
  # keep each pair once (lower question number first) where |Q3| exceeds 0.2
  filter(abs(Q3_score) > 0.2, parse_number(q1) < parse_number(q2)) %>%
  gt()
q1 q2 Q3_score
A18_B16 A19_B0 0.6718485

Items A18 and A19 are on the product and quotient rules.

Given that this violation of the local independence assumption is very mild (particularly as A19 is one of the questions removed from the test), we proceed using this model.

Model parameters

We then compute factor score estimates and augment the existing data frame with these estimates, to keep everything in one place. To do the estimation, we use the fscores() function from the mirt package which takes in a computed model object and computes factor score estimates according to the method specified. We will use the EAP method for factor score estimation, which is the “expected a-posteriori” method, the default. We specify it explicitly below, but the results would have been the same if we omitted specifying the method argument since it’s the default method the function uses.

# Augment the scores with EAP factor score estimates. fscores() returns a
# one-column matrix, so take its first column to store F1 as a plain numeric
# vector — a matrix column in the tibble can behave surprisingly in later
# filtering/joining and when written out to CSV.
test_scores <- test_scores %>%
  mutate(F1 = fscores(fit_2pl, method = "EAP")[, 1])

We can also calculate the model coefficient estimates using a generic function coef() which is used to extract model coefficients from objects returned by modeling functions. We will set the IRTpars argument to TRUE, which means slope intercept parameters will be converted into traditional IRT parameters.

# Extract item coefficients; IRTpars = TRUE converts the slope/intercept
# parameterisation to traditional IRT discrimination (a) and difficulty (b)
coefs_2pl <- coef(fit_2pl, IRTpars = TRUE)

The resulting object coefs_2pl is a list, with one element for each question, and an additional GroupPars element that we won’t be using. The output is a bit long, so we’re only showing a few of the elements here:

# Peek at the first three questions' parameter estimates
coefs_2pl[1:3]
## $A1_B1
##                 a         b  g  u
## par     1.1074475 -3.394517  0  1
## CI_2.5  0.9624536 -3.743726 NA NA
## CI_97.5 1.2524414 -3.045309 NA NA
## 
## $A2_B0
##                 a        b  g  u
## par     0.6008349 1.446187  0  1
## CI_2.5  0.5104951 1.222858 NA NA
## CI_97.5 0.6911746 1.669515 NA NA
## 
## $A3_B2
##                a          b  g  u
## par     1.313206 -0.7087592  0  1
## CI_2.5  1.237460 -0.7577961 NA NA
## CI_97.5 1.388952 -0.6597223 NA NA
# coefs_2pl[35:37]

Let’s take a closer look at the first element:

# A single question's estimates: discrimination (a) and difficulty (b), plus
# the guessing (g = 0) and upper asymptote (u = 1) values fixed by the 2PL
coefs_2pl[1]
## $A1_B1
##                 a         b  g  u
## par     1.1074475 -3.394517  0  1
## CI_2.5  0.9624536 -3.743726 NA NA
## CI_97.5 1.2524414 -3.045309 NA NA

In this output:

  • a is discrimination
  • b is difficulty
  • endpoints of the 95% confidence intervals are also shown

To make this output a little more user friendly, we should tidy it such that we have a row per question. We’ll do this in two steps. First, write a function that tidies the output for one question, i.e. one list element. Then, map this function over the list of all questions, resulting in a data frame.

# Tidy the coef() output for one question into a one-row tibble with columns
# a_est, a_CI_2.5, a_CI_97.5, b_est, b_CI_2.5, b_CI_97.5 (plus the question
# name). `x` is a one-element list — a single question's entry from the list
# returned by coef(fit_2pl, IRTpars = TRUE).
tidy_mirt_coefs <- function(x){
  x %>%
    # melt the list element into long form — presumably reshape2::melt,
    # producing columns X1 (row label), X2 (parameter), value; confirm
    melt() %>%
    # convert to a tibble
    as_tibble() %>%
    # convert factors to characters
    mutate(across(where(is.factor), as.character)) %>%
    # only focus on rows where X2 is a or b (discrimination or difficulty),
    # dropping the fixed g and u parameters
    filter(X2 %in% c("a", "b")) %>%
    # in X1, relabel par (parameter) as est (estimate)
    mutate(X1 = if_else(X1 == "par", "est", X1)) %>%
    # unite columns X2 and X1 into a new column called var separated by _
    unite(X2, X1, col = "var", sep = "_") %>%
    # turn into a wider data frame: one row, one column per parameter/stat
    pivot_wider(names_from = var, values_from = value)
}

Let’s see what this does to a single element in coefs_2pl:

# Check the helper on a single question's coefficients
tidy_mirt_coefs(coefs_2pl[1])
## # A tibble: 1 x 7
##   L1    a_est a_CI_2.5 a_CI_97.5 b_est b_CI_2.5 b_CI_97.5
##   <chr> <dbl>    <dbl>     <dbl> <dbl>    <dbl>     <dbl>
## 1 A1_B1  1.11    0.962      1.25 -3.39    -3.74     -3.05

And now let’s map it over all elements of coefs_2pl:

# Drop the trailing `GroupPars` element (it is not a question), tidy each
# question's coefficients into one row, and join on the question metadata
tidy_2pl <- coefs_2pl %>%
  head(-1) %>%
  map_dfr(tidy_mirt_coefs, .id = "Question") %>%
  left_join(test_versions, by = c("Question" = "label"))

A quick peek at the result:

# A quick peek at the tidied estimates
tidy_2pl
## # A tibble: 41 x 12
##    Question a_est a_CI_2.5 a_CI_97.5  b_est b_CI_2.5 b_CI_97.5 pre   post 
##    <glue>   <dbl>    <dbl>     <dbl>  <dbl>    <dbl>     <dbl> <chr> <chr>
##  1 A1_B1    1.11     0.962     1.25  -3.39    -3.74     -3.05  A1    B1   
##  2 A2_B0    0.601    0.510     0.691  1.45     1.22      1.67  A2    <NA> 
##  3 A3_B2    1.31     1.24      1.39  -0.709   -0.758    -0.660 A3    B2   
##  4 A4_B3    1.26     1.18      1.33  -0.585   -0.633    -0.538 A4    B3   
##  5 A5_B4    1.04     0.961     1.11  -0.614   -0.683    -0.544 A5    B4   
##  6 A6_B5    0.955    0.863     1.05  -2.28    -2.47     -2.09  A6    B5   
##  7 A7_B6    1.43     1.34      1.51  -0.842   -0.891    -0.793 A7    B6   
##  8 A8_B7    1.01     0.949     1.08  -0.538   -0.597    -0.480 A8    B7   
##  9 A9_B8    1.63     1.53      1.74  -0.343   -0.389    -0.296 A9    B8   
## 10 A10_B9   1.41     1.32      1.50  -0.689   -0.745    -0.634 A10   B9   
## # ... with 31 more rows, and 3 more variables: description <chr>, notes <chr>,
## #   outcome <chr>

And a nicely formatted table of the result:

# Formatted table of the 2PL estimates, grouped by question outcome
# (unchanged / removed / added), with column spanners for the two parameters
tidy_2pl %>%
  select(-pre,-post,-notes) %>% 
  group_by(outcome) %>% 
  gt() %>% 
  fmt_number(columns = contains("a_"), decimals = 2) %>%
  fmt_number(columns = contains("b_"), decimals = 2) %>%
  # shade discrimination columns green and difficulty columns blue
  data_color(
    columns = contains("a_"),
    colors = scales::col_numeric(palette = c("Greens"), domain = NULL)
  ) %>%
  data_color(
    columns = contains("b_"),
    colors = scales::col_numeric(palette = c("Blues"), domain = NULL)
  ) %>%
  tab_spanner(label = "Discrimination", columns = contains("a_")) %>%
  tab_spanner(label = "Difficulty", columns = contains("b_")) %>%
  # short labels under each spanner (estimate plus 95% CI endpoints)
  cols_label(
    a_est = "Est.",
    b_est = "Est.",
    a_CI_2.5 = "2.5%",
    b_CI_2.5 = "2.5%",
    a_CI_97.5 = "97.5%",
    b_CI_97.5 = "97.5%"
  )
Question Discrimination Difficulty description
Est. 2.5% 97.5% Est. 2.5% 97.5%
unchanged
A1_B1 1.11 0.96 1.25 −3.39 −3.74 −3.05 linear function
A3_B2 1.31 1.24 1.39 −0.71 −0.76 −0.66 arithmetic rules
A4_B3 1.26 1.18 1.33 −0.59 −0.63 −0.54 Easy ineq.
A5_B4 1.04 0.96 1.11 −0.61 −0.68 −0.54 logs
A6_B5 0.96 0.86 1.05 −2.28 −2.47 −2.09 logs
A7_B6 1.43 1.34 1.51 −0.84 −0.89 −0.79 graph translation
A8_B7 1.01 0.95 1.08 −0.54 −0.60 −0.48 sine pi/3
A9_B8 1.63 1.53 1.74 −0.34 −0.39 −0.30 trig.ineq.
A10_B9 1.41 1.32 1.50 −0.69 −0.74 −0.63 trig.identity
A11_B10 1.30 1.22 1.39 −0.74 −0.80 −0.68 period
A14_B12 1.36 1.28 1.44 −0.50 −0.55 −0.45 limit
A15_B13 0.98 0.91 1.04 −0.20 −0.25 −0.14 series
A16_B14 1.25 1.17 1.33 1.20 1.13 1.27 diff.quotient
A17_B15 1.20 1.13 1.28 −0.55 −0.60 −0.50 graph f'
A18_B16 0.87 0.81 0.94 0.60 0.54 0.67 product rule
A20_B17 1.81 1.70 1.91 −0.96 −1.00 −0.91 (exp)'
A21_B18 1.64 1.54 1.73 −0.42 −0.46 −0.37 (ln(sin))'
A22_B19 1.63 1.54 1.73 −0.51 −0.56 −0.47 hyp.functions
A23_B20 1.43 1.34 1.51 −0.02 −0.06 0.02 slope tangent
A24_B21 1.23 1.15 1.31 −0.71 −0.77 −0.66 IVT
A26_B22 1.63 1.48 1.77 −2.00 −2.12 −1.88 int(poly)
A27_B23 1.37 1.28 1.45 −0.10 −0.15 −0.06 int(exp)
A28_B24 1.29 1.20 1.38 −0.34 −0.39 −0.28 Int = 0
A29_B25 0.32 0.27 0.37 −1.38 −1.66 −1.10 int even funct
A31_B27 1.14 1.07 1.21 0.32 0.27 0.37 int(abs)
A32_B28 1.18 1.09 1.26 1.19 1.12 1.26 FtoC algebra
A33_B29 0.93 0.86 1.00 −1.62 −1.73 −1.51 difference vectors
removed
A2_B0 0.60 0.51 0.69 1.45 1.22 1.67 3D
A12_B0 0.78 0.67 0.89 −1.72 −1.95 −1.49 rational exponents
A13_B0 0.77 0.67 0.86 0.57 0.45 0.68 ellipsoid
A19_B0 1.07 0.95 1.20 0.16 0.07 0.25 quotient rule
A25_B0 0.44 0.36 0.53 −1.56 −1.89 −1.24 velocity
A30_B0 1.16 1.03 1.30 −1.45 −1.60 −1.31 FtoC graph
A34_B0 0.60 0.48 0.72 −2.56 −3.06 −2.06 normal vector
A35_B0 0.98 0.85 1.12 −0.48 −0.61 −0.35 intersect planes
A36_B0 0.93 0.80 1.05 0.65 0.53 0.77 parallel planes
added
A0_B11 0.74 0.67 0.81 −0.72 −0.82 −0.63 rational exponents
A0_B26 0.94 0.86 1.02 −0.53 −0.60 −0.45 FtoC graph
A0_B30 0.52 0.44 0.59 −2.41 −2.78 −2.05 normal vector
A0_B31 0.75 0.67 0.82 −0.06 −0.15 0.03 vector product
A0_B32 1.23 1.11 1.34 0.15 0.07 0.22 scalar product
# Save the tidied 2PL estimates for downstream use
tidy_2pl %>% 
  write_csv("data-eth/OUT_2pl-results.csv")
# Difficulty estimate (with 95% CI) for each question, ordered by number
tidy_2pl %>%
  mutate(question_number = parse_number(Question)) %>%
  ggplot(aes(x = question_number, y = b_est, label = Question)) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0.2) +
  geom_text(colour = "grey") +
  geom_point() +
  labs(x = "Question",
       y = "Difficulty") +
  theme_minimal()

This shows the difficulty and discrimination of each of the questions, highlighting those that were added or removed:

# Discrimination vs difficulty for each question, with 95% confidence
# intervals in both dimensions. Unchanged questions are left unlabelled;
# removed questions are labelled with their version-A name, added questions
# with their version-B name.
# (Removed a dead `mutate(qnum = parse_number(Question))` step — qnum was
# computed but never used anywhere in this plot.)
tidy_2pl %>% 
  ggplot(aes(
    x = a_est,
    y = b_est,
    label = case_when(
      outcome == "unchanged" ~ "",
      outcome == "removed" ~ pre,
      outcome == "added" ~ post
    ),
    colour = outcome
  )) +
  geom_errorbar(aes(ymin = b_CI_2.5, ymax = b_CI_97.5), width = 0, alpha = 0.5) +
  geom_errorbar(aes(xmin = a_CI_2.5, xmax = a_CI_97.5), width = 0, alpha = 0.5) +
  geom_text_repel() +
  geom_point() +
  theme_minimal() +
  labs(x = "Discrimination",
       y = "Difficulty")

Comparing years and classes

Do students from different programmes of study have different distributions of ability?

Differences between years

Compare the distribution of abilities across the five year groups (2017–2021).

# Ridgeline plot of estimated ability (F1), one ridge per cohort year
ggplot(test_scores, aes(F1, y = year, fill = as.factor(year), colour = as.factor(year))) +
  geom_density_ridges(alpha = 0.5) +
  scale_x_continuous(limits = c(-3.5, 3.5)) +
  theme_minimal() +
  labs(title = "Density plot",
       subtitle = "Ability grouped by year of taking the test",
       x = "Ability", y = "Density",
       fill = "Year", colour = "Year")
## Picking joint bandwidth of 0.182

Information curves

Test information curve

# Test information curve with the standard error overlaid
plot(fit_2pl, type = "infoSE", main = "Test information")

Item information curves

# Information curve for each individual item
plot(fit_2pl, type = "infotrace", main = "Item information curves")

Response curves

Test response curves

# Expected total score as a function of ability
plot(fit_2pl, type = "score", auto.key = FALSE)

Item response curves

We can get individual item surface and information plots using the itemplot() function from the mirt package, e.g.

# Trace lines (item characteristic curves) for a single item
mirt::itemplot(fit_2pl, item = 1, 
               main = "Trace lines for item 1")

We can also get the plots for all trace lines, one facet per plot.

# Trace lines for every item, one facet per item
plot(fit_2pl, type = "trace", auto.key = FALSE)

Or all of them overlaid in one plot.

# All trace lines overlaid in a single panel
plot(fit_2pl, type = "trace", facet_items=FALSE)

An alternative approach is using ggplot2 and plotly to add interactivity to make it easier to identify items.

# store the object
plt <- plot(fit_2pl, type = "trace", facet_items = FALSE)
# the data we need is in panel.args
# TODO - I had to change the [[1]] to [[2]] since the plt has two panels for some reason, with the one we want being the 2nd panel
plt_data <- tibble(
  x          = plt$panel.args[[2]]$x,
  y          = plt$panel.args[[2]]$y,
  subscripts = plt$panel.args[[2]]$subscripts,
  item       = rep(colnames(item_scores), each = 200)
) %>%
  mutate(
    # parse_number() extracts the leading question number from labels like
    # "A1_B1". The previous str_remove(item, "A") %>% as.numeric() returned
    # NA for every row (the remainder, e.g. "1_B1", is not numeric — see the
    # all-NA item_no in the printed head() below), so the items were never
    # actually reordered.
    item_no = parse_number(item),
    item    = fct_reorder(item, item_no)
    )

# Inspect the extracted plotting data
head(plt_data)
## # A tibble: 6 x 5
##       x      y subscripts item  item_no
##   <dbl>  <dbl>      <int> <fct>   <dbl>
## 1 -6    0.0529        201 A1_B1      NA
## 2 -5.94 0.0563        202 A1_B1      NA
## 3 -5.88 0.0600        203 A1_B1      NA
## 4 -5.82 0.0639        204 A1_B1      NA
## 5 -5.76 0.0680        205 A1_B1      NA
## 6 -5.70 0.0723        206 A1_B1      NA
# Static ggplot version of the trace lines, one colour per item; the `text`
# aesthetic feeds the plotly tooltip built below
plt_gg <- plt_data %>%
  ggplot(aes(x, y, colour = item, text = item)) +
  geom_line() +
  labs(
    title = "2PL - Trace lines",
    #x = expression(theta),
    x = "theta",
    #y = expression(P(theta)),
    y = "P(theta)",
    colour = "Item"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Interactive version: hovering a line shows the item name
ggplotly(plt_gg, tooltip = "text")
# Stop knitting here; anything after this point is not rendered
knitr::knit_exit()